@@ -36,7 +36,7 @@ module Agents |
||
| 36 | 36 |
|
| 37 | 37 |
Set `expected_update_period_in_days` to the maximum amount of time that you'd expect to pass between Events being created by this Agent (only used to set the "working" status). |
| 38 | 38 |
|
| 39 |
- Set `uniqueness_look_back` (defaults to 10000) to limit the number of events checked for uniqueness (typically for performance). |
|
| 39 |
+ Set `uniqueness_look_back` (defaults to the larger of 200, 3x the number of received events) to limit the number of events checked for uniqueness (typically for performance). |
|
| 40 | 40 |
MD |
| 41 | 41 |
|
| 42 | 42 |
event_description do |
@@ -45,7 +45,8 @@ module Agents |
||
| 45 | 45 |
|
| 46 | 46 |
default_schedule "every_12h" |
| 47 | 47 |
|
| 48 |
- UNIQUENESS_LOOK_BACK = 10000 |
|
| 48 |
+ UNIQUENESS_LOOK_BACK = 200 |
|
| 49 |
+ UNIQUENESS_FACTOR = 3 |
|
| 49 | 50 |
|
| 50 | 51 |
def working? |
| 51 | 52 |
event_created_within?(options['expected_update_period_in_days']) && !recent_error_logs? |
@@ -65,10 +66,32 @@ module Agents |
||
| 65 | 66 |
end |
| 66 | 67 |
|
| 67 | 68 |
def validate_options |
| 69 |
+ # Check required fields are present |
|
| 68 | 70 |
errors.add(:base, "url and expected_update_period_in_days are required") unless options['expected_update_period_in_days'].present? && options['url'].present? |
| 69 | 71 |
if !options['extract'].present? && extraction_type != "json" |
| 70 | 72 |
errors.add(:base, "extract is required for all types except json") |
| 71 | 73 |
end |
| 74 |
+ # Check options: |
|
| 75 |
+ if options['mode'].present? |
|
| 76 |
+ if options['mode'] != "on_change" && options['mode'] != "all" |
|
| 77 |
+ errors.add(:base, "mode should be all or on_change") |
|
| 78 |
+ end |
|
| 79 |
+ end |
|
| 80 |
+ # Check integer variables: |
|
| 81 |
+ if options['expected_update_period_in_days'].present? |
|
| 82 |
+ begin |
|
| 83 |
+ Integer(options['expected_update_period_in_days']) |
|
| 84 |
+ rescue |
|
| 85 |
+ errors.add(:base, "Invalid expected_update_period_in_days format") |
|
| 86 |
+ end |
|
| 87 |
+ end |
|
| 88 |
+ if options['uniqueness_look_back'].present? |
|
| 89 |
+ begin |
|
| 90 |
+ Integer(options['uniqueness_look_back']) |
|
| 91 |
+ rescue |
|
| 92 |
+ errors.add(:base, "Invalid uniqueness_look_back format") |
|
| 93 |
+ end |
|
| 94 |
+ end |
|
| 72 | 95 |
end |
| 73 | 96 |
|
| 74 | 97 |
def check |
@@ -84,9 +107,9 @@ module Agents |
||
| 84 | 107 |
end |
| 85 | 108 |
request.on_success do |response| |
| 86 | 109 |
doc = parse(response.body) |
| 87 |
- old_events = previous_payloads |
|
| 88 | 110 |
|
| 89 | 111 |
if extract_full_json? |
| 112 |
+ old_events = previous_payloads 1 |
|
| 90 | 113 |
result = doc |
| 91 | 114 |
if store_payload? old_events, result |
| 92 | 115 |
log "Storing new result for '#{name}': #{result.inspect}"
|
@@ -119,6 +142,7 @@ module Agents |
||
| 119 | 142 |
return |
| 120 | 143 |
end |
| 121 | 144 |
|
| 145 |
+ old_events = previous_payloads num_unique_lengths.first |
|
| 122 | 146 |
num_unique_lengths.first.times do |index| |
| 123 | 147 |
result = {}
|
| 124 | 148 |
options['extract'].keys.each do |name| |
@@ -159,8 +183,16 @@ module Agents |
||
| 159 | 183 |
raise "Illegal options[mode]: " + options['mode'].to_s |
| 160 | 184 |
end |
| 161 | 185 |
|
| 162 |
- def previous_payloads |
|
| 163 |
- look_back = options['uniqueness_look_back'] ? options['uniqueness_look_back'].to_i : UNIQUENESS_LOOK_BACK |
|
| 186 |
+ def previous_payloads(num_events) |
|
| 187 |
+ if options['uniqueness_look_back'].present? |
|
| 188 |
+ look_back = options['uniqueness_look_back'].to_i |
|
| 189 |
+ else |
|
| 190 |
+ # Larger of UNIQUENESS_FACTOR*num_events and UNIQUENESS_LOOK_BACK |
|
| 191 |
+ look_back = UNIQUENESS_FACTOR*num_events |
|
| 192 |
+ if look_back < UNIQUENESS_LOOK_BACK |
|
| 193 |
+ look_back = UNIQUENESS_LOOK_BACK |
|
| 194 |
+ end |
|
| 195 |
+ end |
|
| 164 | 196 |
events.order("id desc").limit(look_back) if options['mode'].to_s == "on_change"
|
| 165 | 197 |
end |
| 166 | 198 |
|
@@ -21,6 +21,18 @@ describe Agents::WebsiteAgent do |
||
| 21 | 21 |
end |
| 22 | 22 |
|
| 23 | 23 |
describe "#check" do |
| 24 |
+ |
|
| 25 |
+ it "should validate the integer fields" do |
|
| 26 |
+ @checker.options['expected_update_period_in_days'] = "nonsense" |
|
| 27 |
+ lambda { @checker.save! }.should raise_error;
|
|
| 28 |
+ @checker.options['expected_update_period_in_days'] = "2" |
|
| 29 |
+ @checker.options['uniqueness_look_back'] = "nonsense" |
|
| 30 |
+ lambda { @checker.save! }.should raise_error;
|
|
| 31 |
+ @checker.options['mode'] = "nonsense" |
|
| 32 |
+ lambda { @checker.save! }.should raise_error;
|
|
| 33 |
+ @checker.options = @site |
|
| 34 |
+ end |
|
| 35 |
+ |
|
| 24 | 36 |
it "should check for changes (and update Event.expires_at)" do |
| 25 | 37 |
lambda { @checker.check }.should change { Event.count }.by(1)
|
| 26 | 38 |
event = Event.last |
@@ -107,7 +119,7 @@ describe Agents::WebsiteAgent do |
||
| 107 | 119 |
'expected_update_period_in_days' => 2, |
| 108 | 120 |
'type' => "html", |
| 109 | 121 |
'url' => "http://xkcd.com", |
| 110 |
- 'mode' => :on_change, |
|
| 122 |
+ 'mode' => "on_change", |
|
| 111 | 123 |
'extract' => {
|
| 112 | 124 |
'url' => {'css' => "#topLeft a", 'attr' => "href"},
|
| 113 | 125 |
'title' => {'css' => "#topLeft a", 'text' => "true"}
|